# library preparations
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import joblib
import seaborn as sns
import time
import os
# To convert this notebook to PDF via XeLaTeX, the TeX binaries must be on PATH:
# export PATH=/usr/local/texlive/2021/bin/x86_64-linux/xelatex:$PATH
# jupyter nbconvert RandomForest.ipynb --to pdf
# Sanity-check which PATH the kernel actually sees.
print(os.environ['PATH'])
/home/samsoup/anaconda3/bin:/home/samsoup/anaconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
Given that we could just ask the simulation for more samples, I decided NOT to employ a typical train/test split. Rather, I will use all samples for training and optimize based on cross-validation approaches, giving every record a turn in both the training set and the test set. Once the model is fitted, we can then ask the simulation for some more samples (say 1000) to use as a completely separate test set.
Many metrics can be used to evaluate models, some I calculate here are:

Sometimes, in the realm of health sciences, we want to punish or reward the model for doing something really well compared to others. For example, in cancer prediction, more emphasis is placed on avoiding False Negatives (failing to detect the cancer), so we may wish to assign costs/weights (negative means reward) to TP, FP, TN, and FN like this:

This can be implemented as an alternative to the above metrics during cross validation. Alternatively, the cost matrix can be used to classify one particular record; that is, we can use the cost matrix to evaluate risk.
With a RandomForest, I am able to extract the probability of a sample as showing SAS or not, then say:
P(SAS) = 0.2
P(neutral, other) = 0.8
Given the above cost matrix, then when I:
# Load the pre-generated simulation samples from disk.
# allow_pickle is required because the arrays were saved with object dtypes.
train_archive = np.load('./data/train.npz', allow_pickle=True)
test_archive = np.load('./data/test.npz', allow_pickle=True)
X_train = train_archive['X_train']
y_train = train_archive['y_train']
X_test = test_archive['X_test']
y_test = test_archive['y_test']
samples, rows, cols = X_train.shape
print("Before flattening:")
for array in (X_train, y_train, X_test, y_test):
    print(array.shape)
Before flattening: (10000, 160, 2) (10000, 1) (1000, 160, 2) (1000, 1)
"""
fst should range anywhere between 0 and 1 in practice,
and mean squared error is in the same bounds.
But whether we want equal sized bins or not is still not clear
(we may want higher granularity for bins with high fst and good fit,
and low for low fst regardless of fit)
"""
# check ranges for f_st
print("Min Fst", np.amin(X_train[:, :, 0]))
print("Max Fst", np.amax(X_train[:, :, 0]))
# check ranges for means squared error
print("Min SSE", np.amin(X_train[:, :, 1]))
print("Max SSE", np.amax(X_train[:, :, 1]))
Min Fst 0.0 Max Fst 1.0 Min SSE 0.0 Max SSE 3.5890404752572502
# Distribution of Fst values across all samples and loci.
sns.histplot(X_train[:, :, 0].ravel(), binwidth=0.05)
plt.title("Fst Distribution")
plt.show()

# MSE distribution, first over its full range...
sns.histplot(X_train[:, :, 1].ravel(), binwidth=0.05)
plt.title("Mean Squared Error Distribution; No Limit")
plt.show()

# ...then restricted to [0, 1] where most of the mass lies.
sns.histplot(X_train[:, :, 1].ravel(), binwidth=0.05)
plt.title("Mean Squared Error Distribution; Limit [0, 1]")
plt.xlim([0, 1])
plt.show()
# Count how many per-locus metrics fall in the MSE tail at a few cutoffs.
mse_flat = X_train[:, :, 1].ravel()
# ~ 1% of metrics have MSE >= 1,
print((mse_flat >= 1).sum())
# ~3% of metrics have MSE >= 0.6
print((mse_flat >= 0.6).sum())
# ~6% of metrics have MSE >= 0.4
print((mse_flat >= 0.4).sum())
15905 49943 98852
# Fst bin edges: 0.00, 0.05, ..., 0.95, plus a final edge just past the
# observed maximum so np.digitize keeps every value strictly in range.
Fst_bins = [step / 100 for step in range(0, 100, 5)]
Fst_bins.append(np.amax(X_train[:, :, 0]) + 0.01)
print("Fst bins", Fst_bins)

# MSE bin edges: 0.0 to 0.4 in steps of 0.1, plus one wide bin out to the max.
MSE_bins = [step / 10 for step in range(0, 5)]
MSE_bins.append(np.amax(X_train[:, :, 1]) + 0.01)
print("MSE bins", MSE_bins)
def makeLabels(iterable):
    """Build 'start-stop' labels for each pair of consecutive bin edges.

    The stop edge is rounded to two decimals so padded final edges
    (e.g. max + 0.01) stay readable.
    """
    labels = []
    for position in range(len(iterable) - 1):
        lower = iterable[position]
        upper = round(iterable[position + 1], 2)
        labels.append(f"{lower}-{upper}")
    return labels
# Human-readable range labels for every bin, used for plot axes later.
Fst_labels = makeLabels(Fst_bins)
MSE_labels = makeLabels(MSE_bins)
print("Fst labels", Fst_labels)
print("MSE labels", MSE_labels)
def assignBins(Fst_bins, MSE_bins, X_train, X_test):
    """Convert each sample's (Fst, MSE) pairs into a 2-D bin-count matrix.

    Parameters:
        Fst_bins, MSE_bins: monotonically increasing bin edges for the
            Fst column (feature 0) and MSE column (feature 1).
        X_train, X_test: arrays of shape (n_samples, n_loci, 2); values
            are assumed to fall inside [edges[0], edges[-1]) — the caller
            pads the final edge past the observed maximum to ensure this.

    Returns:
        (train_counts, test_counts): integer arrays of shape
        (n_samples, len(Fst_bins) - 1, len(MSE_bins) - 1), where entry
        [i, f, m] counts the loci of sample i falling in Fst bin f and
        MSE bin m.

    Fixes vs. the previous version:
      * the test set's bins were digitized from the *training* set's
        values (Fst_assignments/MSE_assignments were computed from
        X_train but indexed for both matrices) — each matrix is now
        digitized independently;
      * the deprecated ``np.int`` alias (removed in NumPy 1.24) is gone;
      * counts are accumulated in a preallocated array instead of
        repeated np.concatenate calls (which was quadratic).
    """
    n_fst = len(Fst_bins) - 1
    n_mse = len(MSE_bins) - 1

    def computeMatrix(matrix):
        # digitize returns 1-based bin indices for in-range values;
        # shift to 0-based rows/cols of the count matrix.
        fst_idx = np.digitize(matrix[:, :, 0], Fst_bins) - 1
        mse_idx = np.digitize(matrix[:, :, 1], MSE_bins) - 1
        counts = np.zeros((len(matrix), n_fst, n_mse), dtype=int)
        for i in range(len(matrix)):
            # np.add.at accumulates correctly on repeated index pairs.
            np.add.at(counts[i], (fst_idx[i], mse_idx[i]), 1)
        return counts

    return computeMatrix(X_train), computeMatrix(X_test)
# Bin both sets and sanity-check the resulting shapes.
X_train_bins, X_test_bins = assignBins(Fst_bins, MSE_bins, X_train, X_test)
for binned in (X_train_bins, X_test_bins):
    print(binned.shape)
Fst bins [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.01] MSE bins [0.0, 0.1, 0.2, 0.3, 0.4, 3.59904047525725] Fst labels ['0.0-0.05', '0.05-0.1', '0.1-0.15', '0.15-0.2', '0.2-0.25', '0.25-0.3', '0.3-0.35', '0.35-0.4', '0.4-0.45', '0.45-0.5', '0.5-0.55', '0.55-0.6', '0.6-0.65', '0.65-0.7', '0.7-0.75', '0.75-0.8', '0.8-0.85', '0.85-0.9', '0.9-0.95', '0.95-1.01'] MSE labels ['0.0-0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-3.6'] (10000, 20, 5) (1000, 20, 5)
def generateHeatmaps(matrices, Fst_labels, MSE_labels, SAS_labels, savePath):
    """Render one Fst-vs-MSE bin-count heatmap per sample and save it as a PNG.

    Parameters:
        matrices: array of shape (n_samples, n_fst_bins, n_mse_bins) of counts.
        Fst_labels: row labels (Fst bin ranges).
        MSE_labels: column labels (MSE bin ranges).
        SAS_labels: per-sample label concatenated into the plot title.
            NOTE(review): the call site passes y_train, whose rows are
            length-1 arrays at that point — the string '+' presumably
            relies on object-dtype elementwise concatenation; confirm the
            produced titles look right.
        savePath: directory where outputN.png files are written.

    Figures are saved and closed (never shown inline) to keep the
    notebook output light.
    """
    for i in range(0, matrices.shape[0]):
        ax = sns.heatmap(
            pd.DataFrame(
                matrices[i, :, :],
                index=Fst_labels,
                columns=MSE_labels
            ),
            linewidths=.5)
        # invert y axis to sort Fst ascending
        ax.invert_yaxis()
        figure = plt.gcf()
        figure.set_size_inches(16, 12)
        plt.title("Output " + str(i+1) + " - " + SAS_labels[i] + " SNS")
        plt.ylabel("Fst")
        plt.xlabel("MSE")
        # save plot
        saveFile = os.path.join(savePath, "output"+str(i+1)+".png")
        plt.savefig(saveFile)
        plt.close() # do NOT show in-line
generateHeatmaps(
    # just do the first 10 to save some time
    X_train_bins[:10],
    Fst_labels, MSE_labels, y_train, savePath="./Images")

# Flatten targets to 1-D vectors and the bin matrices to one row per sample.
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)
X_train_bins = X_train_bins.reshape(X_train_bins.shape[0], -1)
X_test_bins = X_test_bins.reshape(X_test_bins.shape[0], -1)
print("After flattening:")
for array in (X_train_bins, y_train, X_test_bins, y_test):
    print(array.shape)
After flattening: (10000, 100) (10000,) (1000, 100) (1000,)
# Prepare for K fold cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Number of folds; random_state fixed for reproducibility.
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Accumulator for the pooled contingency table across folds; the blank
# row/column absorb crosstab's 'All' margins.
# NOTE(review): pd.crosstab below is called as crosstab(y_pred, y_true),
# which puts predictions on the index — these 'True'/'Pred' labels look
# transposed relative to that; confirm.
matrix = pd.DataFrame(
    0,
    index=['True Yes', 'True No', ''],
    columns=['Pred Yes', 'Pred No', ''],
)
start = time.time()
# 10-fold CV: fit a fresh forest on each fold's training part and pool
# the resulting contingency tables so every sample is scored exactly once.
for train_index, test_index in kf.split(X_train_bins):
    # identify the train and test set within each fold
    X_fold_train, X_fold_test = X_train_bins[train_index], X_train_bins[test_index]
    y_fold_train, y_fold_test = y_train[train_index], y_train[test_index]
    # fit the model on the training set
    # model = RandomForestClassifier(n_estimators=500, oob_score=True,
    #                                min_samples_leaf = 10, max_depth = 3,
    #                                min_samples_split = 100, max_leaf_nodes = 50,
    #                                max_features = 'sqrt', n_jobs=8)
    model = RandomForestClassifier(n_estimators=500, oob_score=True,
                                   max_features = 'sqrt', n_jobs=8)
    model.fit(X_fold_train, y_fold_train)
    # predict label on validation test set, record results
    y_pred = model.predict(X_fold_test)
    # NOTE(review): y_prob is computed but never used in this loop.
    y_prob = model.predict_proba(X_fold_test)[:,1]
    # collect metrics
    # NOTE(review): pd.crosstab(y_pred, y_fold_test) places PREDICTIONS on
    # the index and TRUE labels on the columns, with category values sorted;
    # the 'True ...'/'Pred ...' names assigned below appear transposed
    # relative to that orientation — confirm, since the TP/FP/FN/TN split
    # after the loop inherits it.
    m = pd.crosstab(y_pred, y_fold_test, margins=True)
    m.index = ['True Yes', 'True No', '']
    m.columns = ['Pred Yes', 'Pred No', '']
    matrix += m
end = time.time()
print('Time taken:', end - start)
print(matrix)
# Pull the four confusion-matrix cells out of the pooled table and report
# the standard derived rates.
TP, FN = matrix.iloc[0, 0], matrix.iloc[0, 1]
FP, TN = matrix.iloc[1, 0], matrix.iloc[1, 1]
total = TP + FN + FP + TN
print("Accuracy: \t\t\t\t\t\t\t", (TP + TN) / total)
print("Error Rate: \t\t\t\t\t\t\t", (FN + FP) / total)
recall = TP / (TP + FN)
print("True Positive Rate (TPR) | Sensitivity | Recall | Coverage\t", recall)
print("True Negative Rate (TNR) | Specificity \t\t\t\t",
      TN / (FP + TN))
print("False Positive Rate (FPR) is \t\t\t\t\t", FP / (FP + TN))
print("False Negative Rate (FNR) is \t\t\t\t\t", FN / (TP + FN))
precision = TP / (TP + FP)
print("Average Precision: \t\t\t\t\t\t", precision)
print("Average Recall: \t\t\t\t\t\t", recall)
print("Average F Measures: \t\t\t\t\t\t",
      (2*precision*recall) / (precision+recall))
Time taken: 14.520958423614502
Pred Yes Pred No
True Yes 2730 2078 4808
True No 2270 2922 5192
5000 5000 10000
Accuracy: 0.5652
Error Rate: 0.4348
True Positive Rate (TPR) | Sensitivity | Recall | Coverage 0.5678036605657238
True Negative Rate (TNR) | Specificity 0.562788906009245
False Positive Rate (FPR) is 0.43721109399075503
False Negative Rate (FNR) is 0.4321963394342762
Average Precision: 0.546
Average Recall: 0.5678036605657238
Average F Measures: 0.5566884176182709
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# n_estimators = number of trees to bag.
# Candidates to tune later: min_samples_split, max_leaf_nodes, max_depth,
# min_samples_leaf.
# Fit the final model on every available training sample.
model = RandomForestClassifier(n_estimators=500, oob_score=True,
                               max_features='sqrt', n_jobs=8)
start = time.time()
model.fit(X_train_bins, y_train)
end = time.time()
print('Time taken:', end - start)
# save the model
# joblib.dump(model, "./data/random_forest_10000.joblib")
from itertools import product

# One human-readable name per flattened (Fst bin, MSE bin) feature, in the
# same order the bin matrix was flattened.
feature_labels = [
    f"Fst {fst}; MSE {mse}" for fst, mse in product(Fst_labels, MSE_labels)
]
for label, importance in zip(feature_labels, model.feature_importances_):
    print(f'Feature {label} importance: {importance} \t\t\t\t\t\t')
# take the top 20 most important features
feature_imp = pd.Series(
    model.feature_importances_, index=feature_labels
).sort_values(ascending=False)[:20]
%matplotlib inline
# Creating a bar plot
# Horizontal bar chart of the 20 largest feature importances.
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Feature Importance Score')
plt.title("Top 20 Important Features In Detecting SAS")
plt.show()
Time taken: 1.4258019924163818 Feature Fst 0.0-0.05; MSE 0.0-0.1 importance: 5.22794103976727e-05 Feature Fst 0.0-0.05; MSE 0.1-0.2 importance: 0.0 Feature Fst 0.0-0.05; MSE 0.2-0.3 importance: 0.0 Feature Fst 0.0-0.05; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.0-0.05; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.05-0.1; MSE 0.0-0.1 importance: 0.039282481929256756 Feature Fst 0.05-0.1; MSE 0.1-0.2 importance: 5.61037289082068e-05 Feature Fst 0.05-0.1; MSE 0.2-0.3 importance: 7.410830896086087e-06 Feature Fst 0.05-0.1; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.05-0.1; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.1-0.15; MSE 0.0-0.1 importance: 0.07010430539592467 Feature Fst 0.1-0.15; MSE 0.1-0.2 importance: 4.7236350829108826e-05 Feature Fst 0.1-0.15; MSE 0.2-0.3 importance: 6.135043871476765e-06 Feature Fst 0.1-0.15; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.1-0.15; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.15-0.2; MSE 0.0-0.1 importance: 0.06980797652646502 Feature Fst 0.15-0.2; MSE 0.1-0.2 importance: 0.0010325967408239285 Feature Fst 0.15-0.2; MSE 0.2-0.3 importance: 0.0 Feature Fst 0.15-0.2; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.15-0.2; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.2-0.25; MSE 0.0-0.1 importance: 0.06859767008120432 Feature Fst 0.2-0.25; MSE 0.1-0.2 importance: 0.026487995133955843 Feature Fst 0.2-0.25; MSE 0.2-0.3 importance: 0.00015103363287626503 Feature Fst 0.2-0.25; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.2-0.25; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.25-0.3; MSE 0.0-0.1 importance: 0.0594204893030077 Feature Fst 0.25-0.3; MSE 0.1-0.2 importance: 0.052387378734165976 Feature Fst 0.25-0.3; MSE 0.2-0.3 importance: 0.00502833811632562 Feature Fst 0.25-0.3; MSE 0.3-0.4 importance: 6.617931552392309e-05 Feature Fst 0.25-0.3; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.3-0.35; MSE 0.0-0.1 importance: 0.028505113955220155 Feature Fst 0.3-0.35; MSE 0.1-0.2 importance: 0.05536124525095431 Feature Fst 0.3-0.35; MSE 0.2-0.3 importance: 
0.023213543795327526 Feature Fst 0.3-0.35; MSE 0.3-0.4 importance: 0.002525297536078205 Feature Fst 0.3-0.35; MSE 0.4-3.6 importance: 0.00010651456476467329 Feature Fst 0.35-0.4; MSE 0.0-0.1 importance: 0.011445288301786857 Feature Fst 0.35-0.4; MSE 0.1-0.2 importance: 0.046528331543390616 Feature Fst 0.35-0.4; MSE 0.2-0.3 importance: 0.03297282448186787 Feature Fst 0.35-0.4; MSE 0.3-0.4 importance: 0.008497478067561709 Feature Fst 0.35-0.4; MSE 0.4-3.6 importance: 0.0011537342814615876 Feature Fst 0.4-0.45; MSE 0.0-0.1 importance: 0.0028576233864339184 Feature Fst 0.4-0.45; MSE 0.1-0.2 importance: 0.03174756244819471 Feature Fst 0.4-0.45; MSE 0.2-0.3 importance: 0.04827573274447334 Feature Fst 0.4-0.45; MSE 0.3-0.4 importance: 0.02775623883305767 Feature Fst 0.4-0.45; MSE 0.4-3.6 importance: 0.010564121237291798 Feature Fst 0.45-0.5; MSE 0.0-0.1 importance: 0.00021960169976647134 Feature Fst 0.45-0.5; MSE 0.1-0.2 importance: 0.0036164522839634877 Feature Fst 0.45-0.5; MSE 0.2-0.3 importance: 0.02030200620786342 Feature Fst 0.45-0.5; MSE 0.3-0.4 importance: 0.029936377005777993 Feature Fst 0.45-0.5; MSE 0.4-3.6 importance: 0.024301789292175316 Feature Fst 0.5-0.55; MSE 0.0-0.1 importance: 3.216363037279239e-05 Feature Fst 0.5-0.55; MSE 0.1-0.2 importance: 0.0013602347144282673 Feature Fst 0.5-0.55; MSE 0.2-0.3 importance: 0.009990780385271032 Feature Fst 0.5-0.55; MSE 0.3-0.4 importance: 0.02261881416061077 Feature Fst 0.5-0.55; MSE 0.4-3.6 importance: 0.025581599109947492 Feature Fst 0.55-0.6; MSE 0.0-0.1 importance: 1.3518327373028837e-05 Feature Fst 0.55-0.6; MSE 0.1-0.2 importance: 0.00012392798172567067 Feature Fst 0.55-0.6; MSE 0.2-0.3 importance: 0.001449389526264316 Feature Fst 0.55-0.6; MSE 0.3-0.4 importance: 0.0075645659264848 Feature Fst 0.55-0.6; MSE 0.4-3.6 importance: 0.01912005976595089 Feature Fst 0.6-0.65; MSE 0.0-0.1 importance: 7.14220246122321e-06 Feature Fst 0.6-0.65; MSE 0.1-0.2 importance: 9.796100266471398e-05 Feature Fst 0.6-0.65; MSE 
0.2-0.3 importance: 0.0009386832713477023 Feature Fst 0.6-0.65; MSE 0.3-0.4 importance: 0.005406464328374595 Feature Fst 0.6-0.65; MSE 0.4-3.6 importance: 0.03183127292134714 Feature Fst 0.65-0.7; MSE 0.0-0.1 importance: 0.0 Feature Fst 0.65-0.7; MSE 0.1-0.2 importance: 1.5758284169674954e-05 Feature Fst 0.65-0.7; MSE 0.2-0.3 importance: 6.417688389115004e-05 Feature Fst 0.65-0.7; MSE 0.3-0.4 importance: 0.0007692531190936019 Feature Fst 0.65-0.7; MSE 0.4-3.6 importance: 0.014862006409830363 Feature Fst 0.7-0.75; MSE 0.0-0.1 importance: 0.0 Feature Fst 0.7-0.75; MSE 0.1-0.2 importance: 2.3740015182917413e-06 Feature Fst 0.7-0.75; MSE 0.2-0.3 importance: 1.803117183028973e-05 Feature Fst 0.7-0.75; MSE 0.3-0.4 importance: 0.0002495504269289544 Feature Fst 0.7-0.75; MSE 0.4-3.6 importance: 0.02063872045959077 Feature Fst 0.75-0.8; MSE 0.0-0.1 importance: 0.0 Feature Fst 0.75-0.8; MSE 0.1-0.2 importance: 0.0 Feature Fst 0.75-0.8; MSE 0.2-0.3 importance: 0.0 Feature Fst 0.75-0.8; MSE 0.3-0.4 importance: 6.640272112612482e-06 Feature Fst 0.75-0.8; MSE 0.4-3.6 importance: 0.012886942277864948 Feature Fst 0.8-0.85; MSE 0.0-0.1 importance: 0.0 Feature Fst 0.8-0.85; MSE 0.1-0.2 importance: 0.0 Feature Fst 0.8-0.85; MSE 0.2-0.3 importance: 0.0 Feature Fst 0.8-0.85; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.8-0.85; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.85-0.9; MSE 0.0-0.1 importance: 0.0 Feature Fst 0.85-0.9; MSE 0.1-0.2 importance: 0.0 Feature Fst 0.85-0.9; MSE 0.2-0.3 importance: 0.0 Feature Fst 0.85-0.9; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.85-0.9; MSE 0.4-3.6 importance: 0.015664581334765993 Feature Fst 0.9-0.95; MSE 0.0-0.1 importance: 0.0 Feature Fst 0.9-0.95; MSE 0.1-0.2 importance: 0.0 Feature Fst 0.9-0.95; MSE 0.2-0.3 importance: 0.0 Feature Fst 0.9-0.95; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.9-0.95; MSE 0.4-3.6 importance: 0.0 Feature Fst 0.95-1.01; MSE 0.0-0.1 importance: 0.0 Feature Fst 0.95-1.01; MSE 0.1-0.2 importance: 0.0 Feature Fst 0.95-1.01; MSE 
0.2-0.3 importance: 0.0 Feature Fst 0.95-1.01; MSE 0.3-0.4 importance: 0.0 Feature Fst 0.95-1.01; MSE 0.4-3.6 importance: 0.006262900911934678
# Extract single tree
# Visualize one of the 500 fitted trees (index 203 chosen arbitrarily).
estimator = model.estimators_[203]
from sklearn.tree import export_graphviz
# Export as dot file
# NOTE(review): export_graphviz maps class_names positionally onto
# model.classes_ (sorted label order) — confirm ["SAS", "Neutral"]
# matches that order.
export_graphviz(estimator, out_file='tree.dot',
                feature_names = feature_labels,
                class_names = ["SAS", "Neutral"],
                rounded = True, proportion = False,
                precision = 2, filled = True)
# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
display(Image(filename = 'tree.png'))
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import cross_val_score
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import cross_val_predict
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import MinMaxScaler
# param_grid = {
# 'max_depth':list(range(35,55+1)),
# 'min_samples_leaf':list(range(8,35,2)),
# 'max_features':['sqrt', 'log2'],
# }
# clf = GridSearchCV(RandomForestClassifier(criterion='gini'), param_grid,
# scoring="accuracy", cv=5, n_jobs=8)
# clf.fit(X_train_bins, y_train)
# print(clf.best_params_)
# rf_pred = cross_val_predict(
# RandomForestClassifier(criterion='gini'),
# X=X_train_bins, y=y_train, cv=5, n_jobs=12)
# print("The average nested cross validation accuracy is ", accuracy_score(y_train, rf_pred))
# print(confusion_matrix(y_train, rf_pred))
# print(classification_report(y_train, rf_pred))
Below are performance measures on the test set, which was never touched during model building.
# Score the fitted model on the held-out test set.
y_pred = model.predict(X_test_bins)
print("Accuracy on Test Set:", accuracy_score(y_test,y_pred))
# Out-of-bag accuracy estimated during training on the full training set.
print(f'Our OOB prediction of accuracy is: {model.oob_score_ * 100}%')
# show stats
# NOTE(review): pd.crosstab(y_pred, y_test) places PREDICTIONS on the index
# and TRUE labels on the columns (category values sorted); the
# 'True ...'/'Pred ...' names below look transposed relative to that —
# confirm before trusting the TP/FP/FN/TN split that follows.
matrix = pd.crosstab(y_pred, y_test, margins=True)
matrix.index = ['True Yes', 'True No', '']
matrix.columns = ['Pred Yes', 'Pred No', '']
print(matrix)
# Pull the confusion-matrix cells and report the standard derived rates.
TP, FN = matrix.iloc[0, 0], matrix.iloc[0, 1]
FP, TN = matrix.iloc[1, 0], matrix.iloc[1, 1]
total = TP + FN + FP + TN
print("Accuracy: \t\t\t\t\t\t\t", (TP + TN) / total)
print("Error Rate: \t\t\t\t\t\t\t", (FN + FP) / total)
print("True Positive Rate (TPR) | Sensitivity | Recall | Coverage\t",
      TP / (TP + FN))
print("True Negative Rate (TNR) | Specificity \t\t\t\t",
      TN / (FP + TN))
print("False Positive Rate (FPR) is \t\t\t\t\t", FP / (FP + TN))
print("False Negative Rate (FNR) is \t\t\t\t\t", FN / (TP + FN))
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print("Average Precision: \t\t\t\t\t\t", precision)
print("Average F Measures: \t\t\t\t\t\t",
      (2*recall*precision) / (recall + precision))
# AUC is computed from positive-class probabilities, not hard labels.
pred_prob = model.predict_proba(X_test_bins)[:, 1]
print("Average AUC: \t\t\t\t\t\t\t", roc_auc_score(y_test, pred_prob))
Accuracy on Test Set: 0.489
Our OOB prediction of accuracy is: 56.699999999999996%
Pred Yes Pred No
True Yes 238 249 487
True No 262 251 513
500 500 1000
Accuracy: 0.489
Error Rate: 0.511
True Positive Rate (TPR) | Sensitivity | Recall | Coverage 0.4887063655030801
True Negative Rate (TNR) | Specificity 0.48927875243664715
False Positive Rate (FPR) is 0.5107212475633528
False Negative Rate (FNR) is 0.5112936344969199
Average Precision: 0.476
Average F Measures: 0.4822695035460993
Average AUC: 0.498712
from sklearn.metrics import precision_recall_curve

# Precision/recall across decision thresholds on the held-out TEST set
# (the original comment said "training set", but X_test_bins is used).
y_prob = model.predict_proba(X_test_bins)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(
    y_test, y_prob, pos_label="Yes")
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last precision/recall entries are dropped because sklearn returns
    one more of each than it returns thresholds.
    """
    for series, fmt, name in (
        (precisions[:-1], "b--", "Precision"),
        (recalls[:-1], "g-", "Recall"),
    ):
        plt.plot(thresholds, series, fmt, label=name)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
# Precision and recall versus threshold.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

# Precision-recall trade-off curve.
plt.plot(precisions, recalls, "b--", label="Precision")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="upper left")
plt.show()
# ROC Curve
# The larger the area under the curve, the better the classifier.
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label="Yes")
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve plus the y=x chance diagonal on the current axes."""
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # random-guess baseline
    plt.axis([0, 1, 0, 1])
# Draw the ROC curve, then report the scalar AUC summary.
plot_roc_curve(fpr, tpr, "Random Forest")
plt.show()

from sklearn.metrics import roc_auc_score
print("The AUC is ", roc_auc_score(y_test, y_prob))
The AUC is 0.498712
from IPython.display import Image

# Re-display the ten saved heatmaps inline for reference.
for index in range(1, 11):
    display(Image(filename="./Images/output" + str(index) + ".png"))
We can try a variety of bin values, but all result in low cross-validated and test accuracy...
Reason: the assumption that SAS samples will have the 'same regions light up' is not exactly true;
From the heatmaps above, SAS samples do NOT look much different from non-SAS samples.